{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "from keras.callbacks import LambdaCallback\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Activation\n",
    "from keras.layers import LSTM\n",
    "from keras.optimizers import RMSprop, Adam\n",
    "from keras.utils.data_utils import get_file\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import random\n",
    "import sys\n",
    "import io\n",
    "import re\n",
    "from unidecode import unidecode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the song dataset; later cells read the 'lyrics' column of this frame.\n",
    "songs = pd.read_csv('data/drake-songs.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_tokenized_lines(df):\n",
    "    \"\"\"Flatten every song in df['lyrics'] into one list of lowercase words.\n",
    "\n",
    "    Each lyric is lowercased, split on the '|-|' separator, passed through\n",
    "    unidecode, and scanned for runs of the characters a-z and apostrophe.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with a 'lyrics' column.\n",
    "\n",
    "    Returns:\n",
    "        A flat list of word strings in corpus order (words, not lines,\n",
    "        despite the function name).\n",
    "    \"\"\"\n",
    "    words = []\n",
    "\n",
    "    # dropna(): a missing lyric would otherwise be stringified to 'nan' and\n",
    "    # enter the corpus as a word. items() replaces iteritems(), which newer\n",
    "    # pandas releases no longer provide.\n",
    "    for _, lyric in df['lyrics'].dropna().items():\n",
    "        for line in str(lyric).lower().split('|-|'):\n",
    "            # extend() grows the list in place instead of copying it per line.\n",
    "            words.extend(re.findall(r\"\\b[a-z']+\\b\", unidecode(line)))\n",
    "\n",
    "    return words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Despite its name this is a flat list of words (get_tokenized_lines(df) returns `words`).\n",
    "# Run this before the later cell that redefines get_tokenized_lines with a different signature.\n",
    "all_lyric_lines = get_tokenized_lines(songs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total Sequences: 75127\n"
     ]
    }
   ],
   "source": [
    "# Each training sample is 50 context words plus the 1 word to predict.\n",
    "SEQ_LENGTH = 50 + 1\n",
    "\n",
    "# Slide a SEQ_LENGTH-word window over the corpus one word at a time.\n",
    "# The stop value is len(...) + 1 because range() excludes it; without the + 1\n",
    "# the final window, which ends on the last word of the corpus, is dropped.\n",
    "sequences = [\n",
    "    all_lyric_lines[end - SEQ_LENGTH: end]\n",
    "    for end in range(SEQ_LENGTH, len(all_lyric_lines) + 1)\n",
    "]\n",
    "\n",
    "print('Total Sequences: %d' % len(sequences))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_doc(lines, filename):\n",
    "    \"\"\"Write token sequences to a text file, one space-joined sequence per line.\n",
    "\n",
    "    Args:\n",
    "        lines: iterable of sequences, each an iterable of string tokens.\n",
    "        filename: path of the file to create or overwrite.\n",
    "    \"\"\"\n",
    "    # Join every sequence (not just the last one) and keep the joined result;\n",
    "    # an empty `lines` yields an empty file instead of a NameError.\n",
    "    data = '\\n'.join(' '.join(line) for line in lines)\n",
    "\n",
    "    # The context manager closes the file even if write() raises.\n",
    "    with open(filename, 'w') as file:\n",
    "        file.write(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 430,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write `sequences` to disk through save_doc.\n",
    "# NOTE(review): a later cell reads 'data/clean_sequences.txt', not this file - confirm which one is intended.\n",
    "out_filename = 'sequences.txt'\n",
    "save_doc(sequences, out_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocabulary size: 5790\n"
     ]
    }
   ],
   "source": [
    "# Sort the vocabulary so every word gets the same index in every kernel session.\n",
    "# Iterating a bare set of strings follows hash order, which Python randomises per\n",
    "# process, so the indices would stop matching a model reloaded from disk later.\n",
    "vocab = sorted(set(all_lyric_lines))\n",
    "\n",
    "# Forward and reverse lookup tables between words and their integer ids.\n",
    "word_to_index = {w: i for i, w in enumerate(vocab)}\n",
    "index_to_word = {i: w for w, i in word_to_index.items()}\n",
    "word_indices = [word_to_index[word] for word in vocab]\n",
    "vocab_size = len(vocab)\n",
    "\n",
    "print('vocabulary size: {}'.format(vocab_size))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_tokenized_lines(lines, seq_len, vocab_index=None):\n",
    "    \"\"\"Encode word sequences as a 2-D array of integer vocabulary indices.\n",
    "\n",
    "    NOTE: this reuses the name of the lyric-splitting helper defined earlier in\n",
    "    the notebook and replaces it once this cell has run.\n",
    "\n",
    "    Args:\n",
    "        lines: sequence of word sequences, each at most seq_len words long.\n",
    "        seq_len: number of columns in the result.\n",
    "        vocab_index: optional word -> index mapping; defaults to the\n",
    "            notebook-level word_to_index.\n",
    "\n",
    "    Returns:\n",
    "        Integer array of shape (len(lines), seq_len). Positions past the end\n",
    "        of a shorter sequence keep the initial value 0.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if a word is missing from the mapping.\n",
    "        IndexError: if a sequence is longer than seq_len.\n",
    "    \"\"\"\n",
    "    if vocab_index is None:\n",
    "        vocab_index = word_to_index\n",
    "\n",
    "    # Integer dtype: the values are vocabulary indices, not measurements.\n",
    "    tokenized = np.zeros((len(lines), seq_len), dtype=int)\n",
    "\n",
    "    for r, line in enumerate(lines):\n",
    "        for c, word in enumerate(line):\n",
    "            tokenized[r, c] = vocab_index[word]\n",
    "\n",
    "    return tokenized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode every window as vocabulary indices; the result has shape (len(sequences), SEQ_LENGTH).\n",
    "tokenized_seq = get_tokenized_lines(sequences, SEQ_LENGTH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(75127,)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The last column is used as the prediction target in the next cell: one value per sequence.\n",
    "tokenized_seq[:, -1].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras.utils import to_categorical\n",
    "\n",
    "# Inputs are every column but the last of each window.\n",
    "X = tokenized_seq[:, :-1]\n",
    "\n",
    "# The target is the last column, one-hot encoded over the vocabulary.\n",
    "y = to_categorical(tokenized_seq[:, -1], num_classes=vocab_size)\n",
    "\n",
    "# Number of context words per sample; used as the model's input length.\n",
    "seq_length = len(X[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_shape (75127, 50)\n",
      "y_shape (75127, 5790)\n"
     ]
    }
   ],
   "source": [
    "# Report the shapes of the model inputs and one-hot targets.\n",
    "for label, array in ((\"X_shape\", X), (\"y_shape\", y)):\n",
    "    print(label, array.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 412,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_4 (Embedding)      (None, 50, 50)            289500    \n",
      "_________________________________________________________________\n",
      "lstm_7 (LSTM)                (None, 50, 100)           60400     \n",
      "_________________________________________________________________\n",
      "lstm_8 (LSTM)                (None, 100)               80400     \n",
      "_________________________________________________________________\n",
      "dense_7 (Dense)              (None, 100)               10100     \n",
      "_________________________________________________________________\n",
      "dense_8 (Dense)              (None, 5790)              584790    \n",
      "=================================================================\n",
      "Total params: 1,025,190\n",
      "Trainable params: 1,025,190\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "None\n",
      "Epoch 1/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 6.4109 - acc: 0.0416\n",
      "Epoch 2/50\n",
      "75127/75127 [==============================] - 204s 3ms/step - loss: 6.0439 - acc: 0.0483\n",
      "Epoch 3/50\n",
      "75127/75127 [==============================] - 208s 3ms/step - loss: 5.8795 - acc: 0.0527\n",
      "Epoch 4/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 5.7512 - acc: 0.0597\n",
      "Epoch 5/50\n",
      "75127/75127 [==============================] - 208s 3ms/step - loss: 5.6513 - acc: 0.0679\n",
      "Epoch 6/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 5.5606 - acc: 0.0731\n",
      "Epoch 7/50\n",
      "75127/75127 [==============================] - 211s 3ms/step - loss: 5.4794 - acc: 0.0765\n",
      "Epoch 8/50\n",
      "75127/75127 [==============================] - 210s 3ms/step - loss: 5.4021 - acc: 0.0795\n",
      "Epoch 9/50\n",
      "75127/75127 [==============================] - 208s 3ms/step - loss: 5.3284 - acc: 0.0841\n",
      "Epoch 10/50\n",
      "75127/75127 [==============================] - 208s 3ms/step - loss: 5.2402 - acc: 0.0925\n",
      "Epoch 11/50\n",
      "75127/75127 [==============================] - 209s 3ms/step - loss: 5.1690 - acc: 0.0981\n",
      "Epoch 12/50\n",
      "75127/75127 [==============================] - 209s 3ms/step - loss: 5.1006 - acc: 0.1026\n",
      "Epoch 13/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 5.0174 - acc: 0.1087\n",
      "Epoch 14/50\n",
      "75127/75127 [==============================] - 204s 3ms/step - loss: 4.9437 - acc: 0.1125\n",
      "Epoch 15/50\n",
      "75127/75127 [==============================] - 204s 3ms/step - loss: 4.8932 - acc: 0.1175\n",
      "Epoch 16/50\n",
      "75127/75127 [==============================] - 208s 3ms/step - loss: 4.8269 - acc: 0.1216\n",
      "Epoch 17/50\n",
      "75127/75127 [==============================] - 209s 3ms/step - loss: 4.8549 - acc: 0.1246\n",
      "Epoch 18/50\n",
      "75127/75127 [==============================] - 208s 3ms/step - loss: 4.8299 - acc: 0.1296\n",
      "Epoch 19/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 4.7425 - acc: 0.1364\n",
      "Epoch 20/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 4.6696 - acc: 0.1414\n",
      "Epoch 21/50\n",
      "75127/75127 [==============================] - 206s 3ms/step - loss: 4.5929 - acc: 0.1472\n",
      "Epoch 22/50\n",
      "75127/75127 [==============================] - 210s 3ms/step - loss: 4.5174 - acc: 0.1529\n",
      "Epoch 23/50\n",
      "75127/75127 [==============================] - 206s 3ms/step - loss: 4.4426 - acc: 0.1586\n",
      "Epoch 24/50\n",
      "75127/75127 [==============================] - 206s 3ms/step - loss: 4.3676 - acc: 0.1645\n",
      "Epoch 25/50\n",
      "75127/75127 [==============================] - 210s 3ms/step - loss: 4.2967 - acc: 0.1703\n",
      "Epoch 26/50\n",
      "75127/75127 [==============================] - 210s 3ms/step - loss: 4.2249 - acc: 0.1766\n",
      "Epoch 27/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 4.1620 - acc: 0.1842\n",
      "Epoch 28/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 4.0946 - acc: 0.1900\n",
      "Epoch 29/50\n",
      "75127/75127 [==============================] - 208s 3ms/step - loss: 4.0346 - acc: 0.1973\n",
      "Epoch 30/50\n",
      "75127/75127 [==============================] - 209s 3ms/step - loss: 3.9751 - acc: 0.2052\n",
      "Epoch 31/50\n",
      "75127/75127 [==============================] - 208s 3ms/step - loss: 3.9169 - acc: 0.2117\n",
      "Epoch 32/50\n",
      "75127/75127 [==============================] - 206s 3ms/step - loss: 3.8584 - acc: 0.2194\n",
      "Epoch 33/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 3.8085 - acc: 0.2262\n",
      "Epoch 34/50\n",
      "75127/75127 [==============================] - 208s 3ms/step - loss: 3.7586 - acc: 0.2332\n",
      "Epoch 35/50\n",
      "75127/75127 [==============================] - 210s 3ms/step - loss: 3.7061 - acc: 0.2420\n",
      "Epoch 36/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 3.6632 - acc: 0.2474\n",
      "Epoch 37/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 3.6192 - acc: 0.2541\n",
      "Epoch 38/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 3.5739 - acc: 0.2599\n",
      "Epoch 39/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 3.5310 - acc: 0.2664\n",
      "Epoch 40/50\n",
      "75127/75127 [==============================] - 209s 3ms/step - loss: 3.4926 - acc: 0.2737\n",
      "Epoch 41/50\n",
      "75127/75127 [==============================] - 208s 3ms/step - loss: 3.4551 - acc: 0.2780\n",
      "Epoch 42/50\n",
      "75127/75127 [==============================] - 208s 3ms/step - loss: 3.4198 - acc: 0.2834\n",
      "Epoch 43/50\n",
      "75127/75127 [==============================] - 212s 3ms/step - loss: 3.3818 - acc: 0.2905\n",
      "Epoch 44/50\n",
      "75127/75127 [==============================] - 210s 3ms/step - loss: 3.3459 - acc: 0.2954\n",
      "Epoch 45/50\n",
      "75127/75127 [==============================] - 208s 3ms/step - loss: 3.3102 - acc: 0.3012\n",
      "Epoch 46/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 3.2782 - acc: 0.3065\n",
      "Epoch 47/50\n",
      "75127/75127 [==============================] - 206s 3ms/step - loss: 3.2436 - acc: 0.3115\n",
      "Epoch 48/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 3.2102 - acc: 0.3175\n",
      "Epoch 49/50\n",
      "75127/75127 [==============================] - 207s 3ms/step - loss: 3.1825 - acc: 0.3213\n",
      "Epoch 50/50\n",
      "75127/75127 [==============================] - 205s 3ms/step - loss: 3.1430 - acc: 0.3269\n"
     ]
    }
   ],
   "source": [
    "# Sequential, Dense and LSTM are already imported in the first cell.\n",
    "from keras.layers import Embedding\n",
    "\n",
    "# Next-word model: embed the seq_length context words, run two stacked LSTMs,\n",
    "# then a dense layer and a softmax over the whole vocabulary.\n",
    "model = Sequential()\n",
    "model.add(Embedding(vocab_size, 50, input_length=seq_length))\n",
    "# return_sequences=True so the second LSTM receives the full sequence.\n",
    "model.add(LSTM(100, return_sequences=True))\n",
    "model.add(LSTM(100))\n",
    "model.add(Dense(100, activation='relu'))\n",
    "model.add(Dense(vocab_size, activation='softmax'))\n",
    "\n",
    "# summary() prints the table itself and returns None; wrapping it in print()\n",
    "# only adds a stray 'None' line to the output.\n",
    "model.summary()\n",
    "\n",
    "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "history = model.fit(X, y, batch_size=128, epochs=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'history' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-18-2325e93fc327>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'loss'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'history' is not defined"
     ]
    }
   ],
   "source": [
    "# Needs `history` from the model.fit(...) cell in the same kernel session; it is not\n",
    "# defined when only the saved model is reloaded (see the NameError recorded in this cell's output).\n",
    "plt.plot(history.history['loss'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 413,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'dump' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-413-ae8f86de54c6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'model.h5'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;31m# save the tokenizer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mdump\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'tokenizer.pkl'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'wb'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'dump' is not defined"
     ]
    }
   ],
   "source": [
    "# Saves the network only; the word_to_index mapping is not written by this cell.\n",
    "model.save('model.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras.models import load_model\n",
    "\n",
    "# Reload the network written by model.save('model.h5'); this rebinds `model`.\n",
    "model = load_model('model.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_doc(filename):\n",
    "    \"\"\"Return the entire contents of the text file `filename` as one string.\"\"\"\n",
    "    # The context manager closes the handle even if read() raises.\n",
    "    with open(filename, 'r') as file:\n",
    "        return file.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load cleaned text sequences\n",
    "in_filename = 'data/clean_sequences.txt'\n",
    "doc = load_doc(in_filename)\n",
    "\n",
    "# Drop blank lines (e.g. the empty string left after a trailing newline) so an\n",
    "# empty seed can never be selected later.\n",
    "lines = [line for line in doc.split('\\n') if line.strip()]\n",
    "\n",
    "# The model sees SEQ_LENGTH - 1 context words; the remaining word is the target.\n",
    "seq_length = SEQ_LENGTH - 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "and don't wanna diss niggas when write no more and don't wanna bring chicks to the crib to watch flick in the basement on the bed with dim lights no more now i'm losin feeling losin appealing need to step it up like prints from shoes on the ceiling oh damn\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from random import randint\n",
    "\n",
    "# randint() includes both endpoints, so the upper bound must be the last valid\n",
    "# index; len(lines) itself would raise IndexError whenever it is drawn.\n",
    "seed_text = lines[randint(0, len(lines) - 1)]\n",
    "print(seed_text + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [],
   "source": [
    "def texts_to_sequences(texts, word_to_index):\n",
    "    \"\"\"Map a list of words to a (1, len(texts)) integer array of vocabulary indices.\n",
    "\n",
    "    Raises KeyError if a word is not a key of word_to_index.\n",
    "    \"\"\"\n",
    "    row = [word_to_index[token] for token in texts]\n",
    "\n",
    "    # Single-row batch, in the same order as the input words.\n",
    "    return np.array(row, dtype=int).reshape(1, len(texts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [],
   "source": [
    "def my_pad_sequences(seq, maxlen):\n",
    "    \"\"\"Force a 2-D index array to exactly `maxlen` columns.\n",
    "\n",
    "    Longer input keeps its last `maxlen` columns; shorter input is left-padded\n",
    "    with zeros.\n",
    "\n",
    "    Args:\n",
    "        seq: 2-D array of shape (rows, n).\n",
    "        maxlen: required number of columns.\n",
    "\n",
    "    Returns:\n",
    "        Array of shape (rows, maxlen) with the same dtype as `seq`.\n",
    "    \"\"\"\n",
    "    n_cols = seq.shape[1]\n",
    "\n",
    "    if n_cols >= maxlen:\n",
    "        return seq[:, n_cols - maxlen:]\n",
    "\n",
    "    # Shorter than maxlen: a negative start index would slice the wrong columns,\n",
    "    # so build the padded array explicitly.\n",
    "    padded = np.zeros((seq.shape[0], maxlen), dtype=seq.dtype)\n",
    "    padded[:, maxlen - n_cols:] = seq\n",
    "    return padded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_seq(model, word_to_index, seq_length, seed_text, n_words):\n",
    "    \"\"\"Greedily extend `seed_text` by `n_words` words predicted by `model`.\n",
    "\n",
    "    Args:\n",
    "        model: trained network whose predict() returns one score per\n",
    "            vocabulary index for a (1, seq_length) batch of word indices.\n",
    "        word_to_index: word -> index mapping used to encode the text and to\n",
    "            decode the predicted index.\n",
    "        seq_length: number of context words fed to the model at each step.\n",
    "        seed_text: whitespace-separated starting text; every word must be a\n",
    "            key of word_to_index.\n",
    "        n_words: number of words to generate.\n",
    "\n",
    "    Returns:\n",
    "        The generated words joined by single spaces (the seed is not included).\n",
    "    \"\"\"\n",
    "    # Invert the vocabulary once instead of scanning it for every generated word.\n",
    "    index_to_word = {index: word for word, index in word_to_index.items()}\n",
    "\n",
    "    result = list()\n",
    "    in_text = seed_text\n",
    "\n",
    "    for _ in range(n_words):\n",
    "        # my_pad_sequences keeps the last seq_length words, so the text is\n",
    "        # encoded whole; dropping the first word here only hurt short seeds.\n",
    "        encoded = texts_to_sequences(in_text.split(), word_to_index)\n",
    "        encoded = my_pad_sequences(encoded, maxlen=seq_length)\n",
    "\n",
    "        # Arg-max over predict() gives the same class id as predict_classes(),\n",
    "        # which newer Keras releases no longer provide.\n",
    "        scores = model.predict(encoded, verbose=0)\n",
    "        yhat = int(np.argmax(scores, axis=-1)[0])\n",
    "\n",
    "        # Unknown ids decode to '' as before.\n",
    "        out_word = index_to_word.get(yhat, '')\n",
    "\n",
    "        in_text += ' ' + out_word\n",
    "        result.append(out_word)\n",
    "\n",
    "    return ' '.join(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hahahaha i've banged calling lightskin matters cooking bill even it'd sweaters h overlook conversation banged pullin eatin recognize freshman six matters wolves resemble buddin mad growin sleepy cosign sum calling shut lie representing wits calling remixing sewed shrimp supported supported females luckily banged learn cater learn attempt representing matters rosetta\n"
     ]
    }
   ],
   "source": [
    "# Generate 50 words continuing from seed_text and show them.\n",
    "generated = generate_seq(model, word_to_index, seq_length, seed_text, 50)\n",
    "print(generated)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
